NYC Taxi

library(arrow)
## 
## Attaching package: 'arrow'
## The following object is masked from 'package:utils':
## 
##     timestamp
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

TLC Trip Record Data

tlc <- read_parquet(file = "~/Documents/data/yellow_tripdata_2022-06.parquet")
str(tlc)
## tibble [3,558,124 × 19] (S3: tbl_df/tbl/data.frame)
##  $ VendorID             : int [1:3558124] 1 1 2 1 1 2 2 1 2 2 ...
##  $ tpep_pickup_datetime : POSIXct[1:3558124], format: "2022-05-31 20:25:41" "2022-05-31 20:44:40" ...
##  $ tpep_dropoff_datetime: POSIXct[1:3558124], format: "2022-05-31 20:48:22" "2022-05-31 21:01:48" ...
##  $ passenger_count      : num [1:3558124] 1 1 1 2 0 1 1 1 1 1 ...
##  $ trip_distance        : num [1:3558124] 11 4.2 9.49 12.1 1.8 2.02 8.08 4.3 8.78 1.76 ...
##  $ RatecodeID           : num [1:3558124] 1 1 1 1 1 1 1 1 1 1 ...
##  $ store_and_fwd_flag   : chr [1:3558124] "N" "N" "N" "N" ...
##  $ PULocationID         : int [1:3558124] 70 170 264 132 140 148 158 246 197 48 ...
##  $ DOLocationID         : int [1:3558124] 48 226 113 17 163 158 116 262 191 186 ...
##  $ payment_type         : int [1:3558124] 1 1 1 2 1 1 1 1 1 1 ...
##  $ fare_amount          : num [1:3558124] 32 14 26 37 9 9 26.5 15 26.5 7.5 ...
##  $ extra                : num [1:3558124] 3 3 0.5 1.75 3 0.5 0.5 3 0.5 0.5 ...
##  $ mta_tax              : num [1:3558124] 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
##  $ tip_amount           : num [1:3558124] 2 0 5 0 2.55 0.64 7.58 3.75 5.56 2.26 ...
##  $ tolls_amount         : num [1:3558124] 6.55 0 6.55 0 0 0 0 0 0 0 ...
##  $ improvement_surcharge: num [1:3558124] 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 ...
##  $ total_amount         : num [1:3558124] 44.4 17.8 42.6 39.5 15.3 ...
##  $ congestion_surcharge : num [1:3558124] 2.5 2.5 2.5 0 2.5 2.5 2.5 2.5 0 2.5 ...
##  $ airport_fee          : num [1:3558124] 0 0 1.25 1.25 0 0 0 0 0 0 ...
library(lubridate)
## Loading required package: timechange
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:arrow':
## 
##     duration
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
tlc <- tlc %>% 
  mutate(
    date_pickup = date(tpep_pickup_datetime),
    date_dropoff = date(tpep_dropoff_datetime),
    hour_pickup = hours(tpep_pickup_datetime), 
    hour_dropoff = hours(tpep_dropoff_datetime),
    weekday_pickup = wday(tpep_pickup_datetime, label = TRUE),
    day_pickup = day(tpep_pickup_datetime)
    )
tlc %>% group_by(weekday_pickup) %>% 
  tally %>% 
  ggplot(aes(x = weekday_pickup, y = n)) + geom_bar(stat = "identity") 

tlc %>% group_by(weekday_pickup) %>% 
  summarise(mean_passanger = mean(passenger_count, na.rm = TRUE)) %>% 
  ggplot(aes(x = weekday_pickup, y = mean_passanger)) + geom_bar(stat = "identity") 

tlc %>% group_by(day_pickup) %>% 
  tally %>% 
  ggplot(aes(x = day_pickup, y = n)) + geom_line()

Geographic

library(sf)
## Linking to GEOS 3.10.2, GDAL 3.4.2, PROJ 8.2.1; sf_use_s2() is TRUE
library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
tlc_zone <- st_read("~/Documents/taxi_zones/taxi_zones.shp", quiet = TRUE)

plot(tlc_zone)

tlc_zone <- st_transform(tlc_zone, crs = 4326)
 ggplot(tlc_zone) + geom_sf() + theme_inset()

our_neighborhood <- tlc_zone %>% 
  filter(zone == "Gramercy"|zone == "Kips Bay")

ggplot(tlc_zone) + geom_sf() + theme_inset() +
  geom_sf(data = our_neighborhood, fill = "red")

bbox <- st_bbox(tlc_zone) %>% as.numeric

nyc_map <- get_stamenmap(bbox = bbox, messaging = FALSE, zoom = 11, 
                         maptype = "toner-lite", format = c("png"))
## Source : http://tile.stamen.com/toner-lite/11/601/768.png
## Source : http://tile.stamen.com/toner-lite/11/602/768.png
## Source : http://tile.stamen.com/toner-lite/11/603/768.png
## Source : http://tile.stamen.com/toner-lite/11/604/768.png
## Source : http://tile.stamen.com/toner-lite/11/601/769.png
## Source : http://tile.stamen.com/toner-lite/11/602/769.png
## Source : http://tile.stamen.com/toner-lite/11/603/769.png
## Source : http://tile.stamen.com/toner-lite/11/604/769.png
## Source : http://tile.stamen.com/toner-lite/11/601/770.png
## Source : http://tile.stamen.com/toner-lite/11/602/770.png
## Source : http://tile.stamen.com/toner-lite/11/603/770.png
## Source : http://tile.stamen.com/toner-lite/11/604/770.png
## Source : http://tile.stamen.com/toner-lite/11/601/771.png
## Source : http://tile.stamen.com/toner-lite/11/602/771.png
## Source : http://tile.stamen.com/toner-lite/11/603/771.png
## Source : http://tile.stamen.com/toner-lite/11/604/771.png
ggmap(nyc_map) + 
  geom_sf(data = our_neighborhood, fill = "red", inherit.aes = FALSE)
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.

#inherit.aes to use coordinates from data table, not nyc_map

ggmap(nyc_map) + geom_sf(data = joined_tbl, aes(fill = N), inherit.aes = FALSE) +
  scale_fill_viridis_c(option = "A")
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.